This R Markdown serves as a general tutorial for data visualization and statistical analyses of common datasets in the Allen Lab. RMarkdown file can be downloaded directly from the HTML file (Click ‘Code’ on the top right corner). Contact me (Jilly) with any questions or comments regarding the code or R/RStudio.
These data are from Isabel’s APP/PS1 Synaptic Stainings. Below is some general information regarding the dataset. Address questions regarding the data itself to Isabel:
The first step is to install and load required packages. Visit https://www.datacamp.com/community/tutorials/r-packages-guide for more information about packages and package repositories. You only need to install packages once - however, you need to keep track of versions and you may need to update packages depending on new releases.
# Default figure size for every chunk in the rendered document
knitr::opts_chunk$set(fig.width = 5, fig.height = 5)
# one option (packageName is a placeholder - substitute a quoted name, e.g. "ggplot2")
install.packages(packageName)
# another option, this specifies the specific repo where the package lives
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install(packageName)
install.packages("multcomp")
library(car)       # Anova(): Type II/III ANOVA tables
library(ggplot2)   # grammar-of-graphics plotting
library(dplyr)     # group_by()/summarise(), used for summary.stats below
library(tidyr)     # data reshaping
library(viridis)   # colorblind-friendly fill/color scales
library(multcomp)  # multiple-comparison procedures
library(magrittr)  # the %>% pipe
library(plotly)    # ggplotly() interactive plots
library(ggpubr)    # theme_pubr(), ggqqplot(), stat_compare_means()
You want your data to have your independent and dependent variables in separate columns. Each sample should be a separate row.
Tips:
# Import the synapse dataset; the first row of the CSV holds column names
data_synapse <- read.csv("Isa_synapse_test.csv", header = TRUE)
# Put WT first so it becomes the reference level in models and plots
data_synapse$Group <- factor(data_synapse$Group, levels = c("WT", "APP"))
# Pair is an identifier, not a quantity - treat it as categorical
data_synapse$Pair <- as.factor(data_synapse$Pair)
# Descriptive statistics / result summaries for every column
summary(data_synapse)
## SampleName Group Pair Bassoon Homer
## Length:30 WT :15 1:6 Min. :1942 Min. : 2187
## Class :character APP:15 2:6 1st Qu.:3106 1st Qu.: 7939
## Mode :character 3:6 Median :4238 Median :10678
## 4:6 Mean :4146 Mean :10573
## 5:6 3rd Qu.:5033 3rd Qu.:13348
## Max. :6161 Max. :16915
##
## Colocalization Region NormalizedbyPair AverageWTbyPair
## Min. : 339.0 Length:30 Min. :0.3519 Min. : 579.7
## 1st Qu.: 689.5 Class :character 1st Qu.:0.6881 1st Qu.: 924.0
## Median : 946.0 Mode :character Median :0.8522 Median : 978.0
## Mean :1082.2 Mean :0.9159 Mean :1239.2
## 3rd Qu.:1372.8 3rd Qu.:1.0879 3rd Qu.:1736.3
## Max. :2872.0 Max. :2.1236 Max. :1978.0
## NA's :25
head(data_synapse) #first few rows of your data.frame
## SampleName Group Pair Bassoon Homer Colocalization Region NormalizedbyPair
## 1 WT1.1 WT 1 4838 16135 1222 CA1 1.3225108
## 2 WT1.2 WT 1 3480 13353 864 CA1 0.9350649
## 3 WT1.3 WT 1 3052 10953 686 CA1 0.7424242
## 4 APP1.1 APP 1 3610 9665 792 CA1 0.8571429
## 5 APP1.2 APP 1 5068 16915 1379 CA1 1.4924242
## 6 APP1.3 APP 1 2966 7042 475 CA1 0.5140693
## AverageWTbyPair
## 1 924
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 NA
#One way of calculating summary stats: mean (SD) of Colocalization per Group,
#formatted as a string for each group via tapply
with(data_synapse, tapply(Colocalization, Group, function(x) {
sprintf("M (SD) = %1.2f (%1.2f)", mean(x), sd(x))
}))
## WT APP
## "M (SD) = 1239.20 (657.30)" "M (SD) = 925.27 (439.09)"
#Another way of calculating summary stats, use 'sd' and 'coloc' for plotting error bars below
#(group_by()/summarise() are dplyr functions and require dplyr to be loaded)
summary.stats <- data_synapse %>%
  group_by(Group) %>%
  summarise(
    sd = sd(Colocalization, na.rm = TRUE),
    # na.rm = TRUE here too, so mean and sd treat missing values consistently
    coloc = mean(Colocalization, na.rm = TRUE)
  )
print(summary.stats)
## # A tibble: 2 x 3
## Group sd coloc
## <fct> <dbl> <dbl>
## 1 WT 657. 1239.
## 2 APP 439. 925.
#Double colons :: specify that a specific function should be used from a specific package
#kable() renders the data.frame as a formatted markdown table in the knitted output
knitr::kable(data_synapse, caption = "Synapse Quantification")
| SampleName | Group | Pair | Bassoon | Homer | Colocalization | Region | NormalizedbyPair | AverageWTbyPair |
|---|---|---|---|---|---|---|---|---|
| WT1.1 | WT | 1 | 4838 | 16135 | 1222 | CA1 | 1.3225108 | 924.0000 |
| WT1.2 | WT | 1 | 3480 | 13353 | 864 | CA1 | 0.9350649 | NA |
| WT1.3 | WT | 1 | 3052 | 10953 | 686 | CA1 | 0.7424242 | NA |
| APP1.1 | APP | 1 | 3610 | 9665 | 792 | CA1 | 0.8571429 | NA |
| APP1.2 | APP | 1 | 5068 | 16915 | 1379 | CA1 | 1.4924242 | NA |
| APP1.3 | APP | 1 | 2966 | 7042 | 475 | CA1 | 0.5140693 | NA |
| WT2.2 | WT | 2 | 5640 | 16696 | 1708 | CA1 | 0.8634985 | 1978.0000 |
| WT2.3 | WT | 2 | 4073 | 13333 | 1354 | CA1 | 0.6845298 | NA |
| WT2.4 | WT | 2 | 4194 | 13207 | 2872 | CA1 | 1.4519717 | NA |
| APP2.1 | APP | 2 | 6082 | 15855 | 1853 | CA1 | 0.9368049 | NA |
| APP2.2 | APP | 2 | 4281 | 10654 | 1042 | CA1 | 0.5267947 | NA |
| APP2.4 | APP | 2 | 5883 | 15394 | 1676 | CA1 | 0.8473205 | NA |
| WT3.1 | WT | 3 | 5685 | 11966 | 1949 | CA1 | 1.1224803 | 1736.3333 |
| WT3.2 | WT | 3 | 5073 | 10183 | 1551 | CA1 | 0.8932617 | NA |
| WT3.3 | WT | 3 | 5051 | 10500 | 1709 | CA1 | 0.9842580 | NA |
| APP3.1 | APP | 3 | 3460 | 8412 | 849 | CA1 | 0.4889614 | NA |
| APP3.2 | APP | 3 | 2445 | 6067 | 611 | CA1 | 0.3518910 | NA |
| APP3.3 | APP | 3 | 2767 | 7796 | 700 | CA1 | 0.4031484 | NA |
| WT4.1 | WT | 4 | 4879 | 15295 | 1188 | CA1 | 1.2147239 | 978.0000 |
| WT4.3 | WT | 4 | 3270 | 11566 | 638 | CA1 | 0.6523517 | NA |
| WT4.4 | WT | 4 | 4742 | 16801 | 1108 | CA1 | 1.1329243 | NA |
| APP4.1 | APP | 4 | 4291 | 10392 | 755 | CA1 | 0.7719836 | NA |
| APP4.2 | APP | 4 | 3723 | 10703 | 718 | CA1 | 0.7341513 | NA |
| APP4.3 | APP | 4 | 4365 | 13023 | 960 | CA1 | 0.9815951 | NA |
| WT5.1 | WT | 5 | 4978 | 6086 | 932 | CA1 | 1.6078206 | 579.6667 |
| WT5.2 | WT | 5 | 2890 | 3485 | 468 | CA1 | 0.8073606 | NA |
| WT5.3 | WT | 5 | 1942 | 2187 | 339 | CA1 | 0.5848189 | NA |
| APP5.1 | APP | 5 | 6161 | 8368 | 1231 | CA1 | 2.1236343 | NA |
| APP5.2 | APP | 5 | 2752 | 2549 | 405 | CA1 | 0.6986774 | NA |
| APP5.3 | APP | 5 | 2741 | 2620 | 433 | CA1 | 0.7469810 | NA |
#A histogram of all Colocalization values (breaks = 10 suggests ~10 bins)
hist(data_synapse$Colocalization, breaks = 10)
#A histogram of just the WT group
hist((subset(data_synapse, Group == "WT")$Colocalization), breaks = 10, main = "Histogram of WT Coloc")
#A histogram of just the APP group
hist((subset(data_synapse, Group == "APP")$Colocalization), breaks = 10, main = "Histogram of APP Coloc")
#alternative way to read in your data for plotting
#with(data_synapse, hist(Colocalization))
#A scatter plot of Bassoon (y) against Homer (x) puncta counts
plot(data_synapse$Homer, data_synapse$Bassoon)
#A boxplot of every Group:Pair combination
#NOTE(review): the x-axis shows Group:Pair combinations, so xlab = "Group"
#undersells what is plotted - consider "Group:Pair"
boxplot(Colocalization ~ Group+Pair, data = data_synapse, xlab = "Group", ylab = "Colocalization")
ggplot is an extremely useful plotting package. Visit https://github.com/z3tt/ggplot-courses for tutorials and presentations to get started with ggplot. Example Graphs using ggplot
The first function in building a graph is the ggplot function. It specifies the data frame to plot and, optionally, the default aesthetic mappings (aes) that subsequent layers inherit.
#the most basic violin plot, including all individual points
ggplot(data_synapse, aes(y=Colocalization, x=Group)) + geom_violin() + geom_point()
#basic histogram
ggplot(data_synapse, aes(Colocalization, fill = Group)) +
  geom_histogram(binwidth = 150)
#Density plots
ggplot(data_synapse, aes(x=Colocalization, fill=Group)) +
  geom_density()
ggplot(data_synapse, aes(x = Colocalization, fill = Group)) +
  geom_density(alpha = 0.3) +
  theme_pubr()
#Density plots with histogram
ggplot(data_synapse, aes(x=Colocalization, fill=Group)) +
  # after_stat(density) is the current ggplot2 spelling; the ..density..
  # dot-dot notation is deprecated
  geom_histogram(aes(y = after_stat(density)), position="identity", alpha=0.5, binwidth = 300) +
  geom_density(alpha = 0.3) +
  theme_pubr()
#only plotting a subset of your data - in this case, the data from Pair 3
ggplot(data_synapse[data_synapse$Pair %in% c("3"),], aes(x=Colocalization, fill=Group)) +
  geom_density(alpha = 0.3) +
  theme_classic()
#Bar graph: bar height = group mean. fun = mean makes the summary explicit
#instead of relying on the mean_se() default, which prints a message
ggplot(data_synapse) +
  geom_bar(aes(Group, Colocalization, fill= Group), position = "dodge", stat = "summary", fun = mean) +
  theme_classic()
## No summary function supplied, defaulting to `mean_se()`
# Bar graph of group means with +/- 1 SD error bars (from summary.stats above)
ggplot(summary.stats, aes(Group, coloc)) +
  geom_bar(aes(fill = Group), stat = "identity") +
  geom_errorbar(aes(ymin = coloc - sd, ymax = coloc + sd), width = 0.1) +
  theme_classic()
# Violin plot of APP vs WT with every individual point overlaid
ggplot(data_synapse, aes(y = Colocalization, x = Group, fill = Group)) +
  geom_violin() +
  geom_point() +
  theme_minimal()
# Same violin plot with a log-scaled y-axis.
# NOTE(review): scale_y_log10() transforms the data and coord_trans("log10")
# transforms the coordinates - using both applies the log in both spaces;
# confirm that this double transformation is intended.
ggplot(data_synapse, aes(y = Colocalization, x = Group, fill = Group)) +
  geom_violin() +
  geom_point() +
  theme_minimal() +
  scale_y_log10() +
  coord_trans(y = "log10")
#An interactive violin plot APP vs WT, including individual points
p <- ggplot(data_synapse, aes(y= Colocalization, x= Group, fill = Group)) +
  geom_violin() +
  # size is a fixed value, so it belongs outside aes(); inside aes() it is
  # treated as a mapped constant and creates a spurious "2" legend entry.
  # NOTE(review): fill has no visible effect on the default point shape (19);
  # use aes(colour = Pair) or add shape = 21 if per-pair coloring is intended.
  geom_point(aes(fill = Pair), size = 2) +
  theme_minimal()
# Convert the static ggplot into an interactive plotly widget
ggplotly(p)
#A basic violin plot separated by Pair
ggplot(data_synapse, aes(y= Colocalization, x= as.factor(Pair), fill = Group)) +
  geom_violin()
# Boxplot of Colocalization by Group with a viridis fill palette and a title
data_synapse %>%
  ggplot(aes(x = Group, y = Colocalization, fill = Group)) +
  geom_boxplot() +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
  theme_classic() +
  theme(legend.position = "right", plot.title = element_text(size = 11)) +
  labs(title = "Boxplot of Synapse Marker Colocalization between APP vs WT", x = "")
# Same boxplot with each observation drawn as a dot, filled by Pair
data_synapse %>%
  ggplot(aes(x = Group, y = Colocalization)) +
  geom_boxplot() +
  geom_dotplot(aes(fill = Pair),
               binaxis = 'y', stackdir = 'center', dotsize = 1,
               position = position_dodge(0.3)) +
  # one manual fill color per Pair (5 pairs)
  scale_fill_manual(values = c("darkolivegreen", "darkorchid1", "cadetblue1", "cyan2", "violetred2")) +
  theme_classic() +
  theme(legend.position = "right", plot.title = element_text(size = 12)) +
  labs(title = "Boxplot of Synapse Marker Colocalization between APP vs WT", x = "")
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.
# Same boxplot style, but plotting the raw Homer puncta counts.
# NOTE(review): the title still says "Colocalization" although Homer is
# plotted - kept byte-for-byte, but consider updating the title text.
data_synapse %>%
  ggplot(aes(x = Group, y = Homer, fill = Group)) +
  geom_boxplot() +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
  theme_classic() +
  theme(legend.position = "right", plot.title = element_text(size = 11)) +
  labs(title = "Boxplot of Synapse Marker Colocalization between APP vs WT", x = "")
# Boxplots shown side by side for each Pair, filled by Group
data_synapse %>%
  ggplot(aes(x = as.factor(Pair), y = Colocalization, fill = Group)) +
  geom_boxplot() +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
  theme_classic(base_size = 11) +
  theme(legend.position = "right", plot.title = element_text(size = 14), plot.caption = element_text(hjust = 0)) +
  labs(title = "Synapse Quantification WT vs APP", caption = "You can insert a caption like this", x = "Pair")
# Same per-pair boxplots, using values normalized to each pair's WT mean
data_synapse %>%
  ggplot(aes(x = as.factor(Pair), y = NormalizedbyPair, fill = Group)) +
  geom_boxplot() +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
  theme_classic(base_size = 11) +
  theme(legend.position = "right", plot.title = element_text(size = 14), plot.caption = element_text(hjust = 0)) +
  labs(title = "Synapse Quantification WT vs APP", caption = "You can insert a caption like this", x = "Pair")
I highly recommend the Points of Significance Collection from Nature Methods. Many factors should be considered when choosing a significance test for your data. You should also think about these factors and what tests you will perform before completing the experiment. A good understanding of 1) why you are choosing a particular test and 2) what are the limitations (or probability of types of errors) is critical.
If your data are not normally distributed, you will need to choose a non-parametric test. You can also consider log transforming your data (or do some other transformation)
#Do the points fall approximately along this reference line? If yes, you can assume normality.
#Quantile-quantile plot of the pooled Colocalization values (ggqqplot is from ggpubr)
ggqqplot(data_synapse$Colocalization)
#Is the p-value greater than 0.05? If yes, you can assume normality
#Shapiro-Wilk test on the pooled data (both groups together)
shapiro.test(data_synapse$Colocalization)
##
## Shapiro-Wilk normality test
##
## data: data_synapse$Colocalization
## W = 0.91628, p-value = 0.0215
#Normality test for each group separately - APP
#(n = 15 per group, so the test has limited power to detect non-normality)
shapiro.test(subset(data_synapse, Group == "APP")$Colocalization)
##
## Shapiro-Wilk normality test
##
## data: subset(data_synapse, Group == "APP")$Colocalization
## W = 0.91354, p-value = 0.1534
#Normality test for each group separately - WT
shapiro.test(subset(data_synapse, Group == "WT")$Colocalization)
##
## Shapiro-Wilk normality test
##
## data: subset(data_synapse, Group == "WT")$Colocalization
## W = 0.9431, p-value = 0.423
#Is the p-value greater than 0.05? If yes, you can assume there is no significant difference between the two variances.
#Only use F test if your populations are normal
#F test of equal variances of Colocalization between the two Group levels
var.test(Colocalization ~ Group, data = data_synapse)
##
## F test to compare two variances
##
## data: Colocalization by Group
## F = 2.2408, num df = 14, denom df = 14, p-value = 0.1433
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.7523175 6.6745485
## sample estimates:
## ratio of variances
## 2.240843
| Methods | R function | Description |
|---|---|---|
| T-test | t.test() | Compare two groups (parametric) |
| Mann Whitney/Wilcoxon Rank Sum | wilcox.test() | Compare two groups (non-parametric) |
| ANOVA | aov() or anova() | Compare multiple groups (parametric) |
| Kruskal-Wallis | kruskal.test() | Compare multiple groups (non-parametric) |
You can also add your results from your significance test to your graphs. See some examples below:
#Create a boxplot of summarized data across pairs
p <- data_synapse %>%
ggplot(aes(x=Group, y=Colocalization, fill = Group)) +
geom_boxplot() +
scale_fill_viridis(discrete = TRUE, alpha=0.6) +
theme_classic() +
theme(legend.position="right",plot.title = element_text(size=11)) +
xlab("")
#add stats results to plot 'p' (stat_compare_means is from ggpubr)
p + stat_compare_means(method = "t.test")
#change position of stat results
p + stat_compare_means(method = "t.test", label.x = 1.5, label.y = 3000)
#two-sample t test (the formula compares the two Group levels; this is not a
#one-sample test). var.equal = TRUE gives the pooled-variance Student t-test
#rather than the Welch default.
t.test(Colocalization ~ Group, data=data_synapse, var.equal = TRUE, conf.level=0.95)
##
## Two Sample t-test
##
## data: Colocalization by Group
## t = 1.5381, df = 28, p-value = 0.1352
## alternative hypothesis: true difference in means between group WT and group APP is not equal to 0
## 95 percent confidence interval:
## -104.1445 732.0112
## sample estimates:
## mean in group WT mean in group APP
## 1239.2000 925.2667
#another way to do t-test, using Holm method for multiple comparison correction
#(with only two groups there is a single comparison, so no correction actually applies)
with(data_synapse, pairwise.t.test(Colocalization, Group, p.adj = "holm"))
##
## Pairwise comparisons using t tests with pooled SD
##
## data: Colocalization and Group
##
## WT
## APP 0.14
##
## P value adjustment method: holm
#Wilcoxon Rank sum test - non-parametric two sample test
#(a.k.a. Mann-Whitney U test); paired=FALSE because samples are independent
wilcox.test(data_synapse$Colocalization ~ data_synapse$Group, paired=FALSE)
##
## Wilcoxon rank sum exact test
##
## data: data_synapse$Colocalization by data_synapse$Group
## W = 144, p-value = 0.2017
## alternative hypothesis: true location shift is not equal to 0
#One-way ANOVA - for comparison of means between 3 or more groups - same as t-test, but can be used when you have more than 2 levels
oneway.test(Colocalization ~ Group, data = data_synapse, var.equal = TRUE)
##
## One-way analysis of means
##
## data: Colocalization and Group
## F = 2.3659, num df = 1, denom df = 28, p-value = 0.1352
#output (structure) of all the results
#str() reveals the components of the returned "htest" list
#(statistic, parameter, p.value, method, data.name)
result <-oneway.test(Colocalization ~ Group, data = data_synapse, var.equal = TRUE)
str(result)
## List of 5
## $ statistic: Named num 2.37
## ..- attr(*, "names")= chr "F"
## $ parameter: Named num [1:2] 1 28
## ..- attr(*, "names")= chr [1:2] "num df" "denom df"
## $ p.value : num 0.135
## $ method : chr "One-way analysis of means"
## $ data.name: chr "Colocalization and Group"
## - attr(*, "class")= chr "htest"
#extract specific output from results (p.value in this case)
#components of the htest list are accessed with $
result$p.value
## [1] 0.1352392
Bonferroni or Tukey:
Benjamini and Hochberg (BH or FDR)
# Two-way interaction plot. The slopes will be different if there is an interaction between Group and Pair
# (one trace per Pair; fun = mean plots group means; type = "b" draws both points and lines)
with(data_synapse, interaction.plot(x.factor = Group, trace.factor = Pair,
response = Colocalization, fun = mean,
type = "b", legend = TRUE,
xlab = "Group", ylab="Colocalization"))
# Two-way ANOVA, with an interaction term (Do Group and Pair interact to influence Colocalization?)
# NOTE(review): the name 'aov' shadows stats::aov() from here on; a name like
# 'fit_synapse' would be clearer (kept because later chunks reference 'aov').
aov <- (with(data_synapse, aov(Colocalization ~ Group*Pair))) #alternatively use 'lm' instead of 'aov'
# summary() on the fitted aov model prints the sequential (Type I) ANOVA table.
# The previous summary(Anova(aov)) column-summarised the Type II table, which
# yields meaningless Min./Median/Max. rows rather than a test.
summary(aov)
## Sum Sq Df F value Pr(>F)
## Min. : 739156 Min. : 1.00 Min. :1.877 Min. :0.001068
## 1st Qu.:1071665 1st Qu.: 3.25 1st Qu.:3.286 1st Qu.:0.021794
## Median :2165901 Median : 4.00 Median :4.694 Median :0.042519
## Mean :2371750 Mean : 7.25 Mean :4.528 Mean :0.065818
## 3rd Qu.:3465986 3rd Qu.: 8.00 3rd Qu.:5.853 3rd Qu.:0.098192
## Max. :4416041 Max. :20.00 Max. :7.011 Max. :0.153866
## NA's :1 NA's :1
Anova(aov)
## Anova Table (Type II tests)
##
## Response: Colocalization
## Sum Sq Df F value Pr(>F)
## Group 739156 1 4.6941 0.042519 *
## Pair 4416041 4 7.0111 0.001068 **
## Group:Pair 1182502 4 1.8774 0.153866
## Residuals 3149301 20
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(aov, which = "Pair", conf.level = 0.95)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = Colocalization ~ Group * Pair)
##
## $Pair
## diff lwr upr p adj
## 2-1 847.8333 162.2703 1533.39637 0.0109802
## 3-1 325.1667 -360.3964 1010.72970 0.6230043
## 4-1 -8.5000 -694.0630 677.06303 0.9999995
## 5-1 -268.3333 -953.8964 417.22970 0.7671136
## 3-2 -522.6667 -1208.2297 162.89637 0.1919523
## 4-2 -856.3333 -1541.8964 -170.77030 0.0101145
## 5-2 -1116.1667 -1801.7297 -430.60363 0.0007868
## 4-3 -333.6667 -1019.2297 351.89637 0.6005585
## 5-3 -593.5000 -1279.0630 92.06303 0.1102282
## 5-4 -259.8333 -945.3964 425.72970 0.7868964
#Read the plaque-associated GPC5 dataset
data_plaque<- read.csv("Plaque_associated_gpc5.csv", header = TRUE)
#Drop rows containing any NA values
data_plaque <- na.omit(data_plaque)
#Treat Group and Plaques as categorical variables
data_plaque$Group <- as.factor(data_plaque$Group)
data_plaque$Plaques <- as.factor(data_plaque$Plaques)
#Shift Area by +1 so log(Area2) is defined when Area = 0 (the minimum, per summary)
data_plaque$Area2 <- data_plaque$Area + 1
summary(data_plaque)
## Group Plaques Group2 Pair Area
## AD:2247 1:3928 Length:4088 Min. :3.000 Min. : 0.00000
## WT:1841 2: 160 Class :character 1st Qu.:3.000 1st Qu.: 0.09771
## Mode :character Median :4.000 Median : 0.48850
## Mean :4.355 Mean : 1.36284
## 3rd Qu.:6.000 3rd Qu.: 1.95420
## Max. :6.000 Max. :16.61060
## Area2
## Min. : 1.000
## 1st Qu.: 1.098
## Median : 1.488
## Mean : 2.363
## 3rd Qu.: 2.954
## Max. :17.611
#Mean (SD) of Area for each level of Group2 (AD_noplaques / AD_plaques / WT)
with(data_plaque, tapply(Area, Group2, function(x) {
sprintf("M (SD) = %1.2f (%1.2f)", mean(x), sd(x))
}))
## AD_noplaques AD_plaques WT
## "M (SD) = 1.56 (2.22)" "M (SD) = 1.12 (1.61)" "M (SD) = 1.16 (1.61)"
# Density plots with semi-transparent fill
p1 <- ggplot(data_plaque[data_plaque$Group2 %in% c("AD_plaques"),], aes(x=log(Area2), fill=Group2)) +
  geom_density(alpha = 0.3) +
  theme_classic()
# Overlay the AD_noplaques density on the same axes for comparison
p1 + geom_density(data = data_plaque[data_plaque$Group2 %in% c("AD_noplaques"),], aes(x = log(Area2)), alpha = 0.3)
# Violin plot of Area by Group2 with jittered individual points
data_plaque %>%
  ggplot(aes(x=Group2, y=Area, fill=Group2)) +
  geom_violin() +
  scale_fill_viridis(discrete = TRUE, alpha=0.6) +
  geom_jitter(color="black", size=0.4, alpha=0.9) +
  theme_classic() +
  theme(
    legend.position="none",
    plot.title = element_text(size=11)
  ) +
  # Title corrected: this chart uses geom_violin(), not a boxplot
  ggtitle("A violin plot with jitter") +
  xlab("")
#Basic t test
#Welch two-sample t-test of Area between WT and AD (var.equal defaults to FALSE);
#the %in% filter keeps both existing Group levels, so it is a safeguard here
t.test(Area ~ Group, data=data_plaque[data_plaque$Group %in% c("WT","AD"),])
##
## Welch Two Sample t-test
##
## data: Area by Group
## t = 6.1494, df = 4039.3, p-value = 8.532e-10
## alternative hypothesis: true difference in means between group AD and group WT is not equal to 0
## 95 percent confidence interval:
## 0.2488268 0.4817492
## sample estimates:
## mean in group AD mean in group WT
## 1.527340 1.162052
#ANOVA
# Store the fit under a name that does not shadow stats::lm(). (Shadowing is
# harmless for calls, since R skips non-function bindings when resolving a
# function name, but it is confusing to readers.)
fit_plaque <- (with(data_plaque, lm(Area ~ Group)))
# Type II ANOVA table (car::Anova) for the linear model
Anova(fit_plaque)
## Anova Table (Type II tests)
##
## Response: Area
## Sum Sq Df F value Pr(>F)
## Group 135 1 35.63 2.589e-09 ***
## Residuals 15485 4086
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
oneway.test(Area ~ Group, data = data_plaque)
##
## One-way analysis of means (not assuming equal variances)
##
## data: Area and Group
## F = 37.815, num df = 1.0, denom df = 4039.3, p-value = 8.532e-10
#Other example tests
#ks.test(AD_noplaques.df$Area, AD_plaques.df$Area)
#wilcox.test(AD_noplaques.df$Area, AD_plaques.df$Area, paired=FALSE) # both arguments must be numeric vectors
#summary(m1 <- glm(Colocalization ~ Group, family="poisson", data=data))